import datetime
import json
import math
import os
import random
from functools import reduce

import numpy
import pandas
# Read in data from Excel or CSV using read_excel / read_csv.
# If read_excel is used, make sure you include the sheet_name parameter.
datapath = "/users/danielcorcoran/desktop/github_repos/\
python_nb_data_profiling/test_data/Sample - Superstore.xls"
data = pandas.read_excel(datapath, sheet_name = "Orders")
print("Dataframe dimensions are {}".format(data.shape))
# directory that profile.json / profile.csv / *.png exports are written to
exportpath = "/users/danielcorcoran/desktop/github_repos/python_nb_data_profiling/exports/"
# total character width of each printed report line
report_width = 80
spacing_char = " "
# Categorical features whose unique-member count falls inside this
# inclusive range will be used as pairplot hues below.
nunique_range = [2, 7]
# palette can be a list of custom HTML colours or a predefined seaborn set
palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
# Visualization type for identity (diagonal) pairings:
# types available 'kde' (kernel density), 'hist' (histogram)
identity_type = 'kde'
# Visualization type for non-identity pairings:
# types available 'scatter', 'reg' (regression)
non_identity_type = 'scatter'
# NOTE(review): removed `seaborn.set_palette?` — a Jupyter introspection
# artifact that is a SyntaxError in plain Python.
def generate_dynamic_random_code(code_length):
    """Return a random alphanumeric string of ``code_length`` characters.

    Characters are drawn with replacement from ASCII letters (both cases)
    and digits, using the module-level ``random`` state.

    Parameters
    ----------
    code_length : int
        Number of characters in the returned code; 0 yields "".
    """
    characters = "abcdefghijklmnopqrstuvwxyz1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # random.choice replaces the original placeholder-list + randint indexing
    return "".join(random.choice(characters) for _ in range(code_length))
def generate_float_frame(row_count, column_count, assign_col_names = False, nullperc = 0.04):
    """Build a DataFrame of uniform random floats with some cells nulled.

    Parameters
    ----------
    row_count, column_count : int
        Dimensions of the generated frame.
    assign_col_names : bool
        When True, columns are renamed to unique random 8-character codes.
    nullperc : float
        Fraction (0..1) of cells to set to NaN. Approximate: the same
        cell may be drawn more than once. Out-of-range values fall back
        to 0.2 with a printed notice.
    """
    frame = pandas.DataFrame(numpy.random.random((row_count, column_count)))
    if assign_col_names:
        # a set guarantees the random codes cannot yield duplicate labels
        names = set()
        while len(names) < column_count:
            names.add(generate_dynamic_random_code(8))
        frame.columns = list(names)
    # BUG FIX: nullperc is a fraction, so the valid upper bound is 1, not
    # 100 (the old check let e.g. nullperc=5 null out 5x the cell count).
    if nullperc < 0 or nullperc > 1:
        nullperc = 0.2
        print("percentage of nulls defaulted to 0.2")
    null_cells = int(row_count * column_count * nullperc)
    for _ in range(null_cells):
        r_index = random.randint(0, row_count - 1)
        c_index = random.randint(0, column_count - 1)
        frame.iloc[r_index, c_index] = numpy.nan
    return frame
# Generate a synthetic float DataFrame to profile (replaces the Excel
# data loaded above for the remainder of the run).
# NOTE(review): the original line was a garbled notebook-cell boundary
# ("generate_float_frame¶data = ..."); restored as a plain assignment.
data = generate_float_frame(row_count = 400,         # 400 rows
                            column_count = 6,        # 6 columns
                            assign_col_names = True, # random string column names
                            nullperc = 0.05)         # ~5% of cells null
# Summarise every column (include="all" keeps non-numeric stats too),
# transpose so each feature becomes a row, then move the feature names
# out of the index into a regular "index" column for merging later.
describe = data.describe(include = "all")
describe_final = describe.T.reset_index()
# Capture each feature's dtype as a one-column DataFrame and move the
# feature names from the index into a regular "index" column so the
# table can be merged with the other per-feature tables below.
# (The bare-expression notebook preview was removed — it is a no-op in
# a script.)
dtypes_final = pandas.DataFrame(data.dtypes, columns = ["data_type"]).reset_index()
# Null composition: per-feature counts and percentages of filled and
# missing cells, combined into one table keyed on the "index" column
# (which holds the feature names after reset_index()).
filled_count_series = data.notnull().sum().astype(int)
null_count_series = data.isnull().sum().astype(int)
# number of rows in the dataset, used as the percentage denominator
totalcount = data.shape[0]
percent_null = (null_count_series / totalcount).reset_index().rename(
    columns = {0: "null_percent"})
percent_filled = (filled_count_series / totalcount).reset_index().rename(
    columns = {0: "non_null_percent"})
filled_count_series = filled_count_series.reset_index().rename(
    columns = {0: "non_null_counts"})
null_count_series = null_count_series.reset_index().rename(
    columns = {0: "null_counts"})
# reduce pairwise-merges the four per-feature tables on "index".
# (`from functools import reduce` now lives at the top of the file.)
null_info_dataframe = reduce(
    lambda left, right: pandas.merge(left, right, on = 'index'),
    [percent_null, percent_filled, filled_count_series, null_count_series])
# Combine the null stats, dtypes and describe() stats into one profile
# table, one row per feature.
merged = reduce(
    lambda left, right: pandas.merge(left, right, on='index', how="left"),
    [null_info_dataframe, dtypes_final, describe_final])
merged["row_count"] = totalcount
# describe()'s "count" duplicates non_null_counts, so drop it
merged.drop(["count"], axis = 1, inplace = True)
# dtypes are numpy dtype objects; stringify them for display/export
merged["data_type"] = merged["data_type"].astype(str)
# round the numeric summary stats for a tidier printed report
rounded_places = 4
for column in ["mean", "std", "min", "25%", "50%", "75%", "max"]:
    merged[column] = merged[column].astype(float).round(rounded_places)
# Print a fixed-width report for every feature and collect the same
# stats into profile_dict (feature name -> list of {stat: value} dicts)
# for the JSON export below.
profile_dict = {}
# every merged column except the leading 'index' (feature-name) column
stat_columns = list(merged.columns)[1:]
half_width = math.floor(report_width / 2)
feature_count = merged.shape[0]
for row in range(feature_count):
    feature = merged.loc[row, "index"]
    header = " {} ({}/{}) ".format(feature, row + 1, feature_count)
    # banner line announcing the feature
    print("\n\n", header.center(report_width, "-"), "\n", sep="")
    profile_dict[feature] = []
    for attr in stat_columns:
        label = attr + ":"
        value = merged.loc[row, attr]
        # percentages are shown with 2 significant digits; everything
        # else is printed verbatim
        if "percent" in attr:
            rendered = "{0:.2}".format(value)
        else:
            rendered = str(value)
        print(label.ljust(half_width, spacing_char),
              rendered.rjust(half_width, spacing_char),
              sep="")
        profile_dict[feature].append({attr: value})
def convert_invalid_values(value):
    """json.dump fallback serializer: try float(value), else str(value).

    Used as the ``default=`` hook in json.dump so values json cannot
    serialise natively (e.g. numpy scalars) are coerced to a float when
    possible and to their string representation otherwise.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # the two exceptions float() raises; replaces the bare except,
        # which also swallowed KeyboardInterrupt/SystemExit
        return str(value)
# Export the profile as JSON (convert_invalid_values coerces any
# non-serialisable value to float, falling back to str) and as CSV.
# `import json` now lives at the top of the file.
with open(exportpath + "profile.json", "w") as fileobj:
    json.dump(profile_dict, fileobj, default = convert_invalid_values)
# (the explicit fileobj.close() was removed — the with-block closes it)
merged.to_csv(exportpath + "profile.csv", index = False)
# --- (Optional) pairplot visualisations below ---
import seaborn
import matplotlib.pyplot as plt

seaborn.set(font_scale=4)
seaborn.set_style("whitegrid")
# Drop any row containing a null so the pairplots below use complete cases.
data_no_nulls = data.dropna(how = "any", axis = 0)
# Quick pairplot of the full (null-containing) data. The original call
# was a notebook artifact spelling out every parameter; all of them were
# defaults, so only the ones that matter are kept. dropna=True lets
# seaborn skip null rows itself here.
seaborn.pairplot(data, kind='scatter', diag_kind='hist',
                 size=2.5, aspect=1, dropna=True)
# Pairplot visualisations. If describe() produced a "unique" column, the
# data contains object (categorical) columns: emit one pairplot per
# categorical feature whose cardinality falls inside nunique_range,
# using that feature as the hue. Otherwise emit a single plain pairplot.
if "unique" in merged.columns:
    # feature names that are object-typed with nunique inside the range
    subset_merged = list(merged[(merged["unique"] >= nunique_range[0]) &
                                (merged["unique"] <= nunique_range[1]) &
                                (merged["data_type"] == "object")]["index"])
    # preview the categorical features selected
    print(subset_merged)
    for column in subset_merged:
        # NOTE(review): the stray plt.figure() calls were removed —
        # pairplot creates its own Figure, so they only leaked empty ones.
        myplot = seaborn.pairplot(data = data_no_nulls,
                                  kind = non_identity_type,
                                  diag_kind = identity_type,
                                  hue = column,
                                  palette = palette, size = 8)
        myplot.fig.suptitle("Pairplot categorized by {}".format(column),
                            y = 1.03)
        # save the grid's own figure explicitly instead of relying on
        # pyplot's "current figure", then close it to free memory
        myplot.fig.savefig(exportpath + column + "_pairplot.png")
        plt.close(myplot.fig)
# if we only have floats in our data, do this
else:
    myplot = seaborn.pairplot(data = data_no_nulls,
                              kind = non_identity_type,
                              diag_kind = identity_type, size = 8)
    myplot.fig.suptitle("Pairplot", y = 1.03)
    myplot.fig.savefig(exportpath + "data_profile_pairplot.png")
    plt.close(myplot.fig)